library(haven)
library(tm)
library(tidyverse)

# Load the Stata file holding the panel data.
ketchley_wenig_CPS <- read_dta("ketchley_wenig_CPS.dta")

# Reduce the panel to one row per person: within each person_id group keep
# only the row that comes first in the data as read.
subset_data <- ketchley_wenig_CPS %>%
  group_by(person_id) %>%
  slice_head(n = 1) %>%
  ungroup()

# Pull the job titles out as a plain vector (one entry per row of subset_data).
text <- subset_data$job_title

# Build a tm corpus with one document per job title.
docs <- Corpus(VectorSource(text))

# Clean the titles. Lower-casing must come before stop-word removal:
# removeWords() matches case-sensitively and stopwords("english") is all
# lower case, so capitalised stop words ("The", "Of", "And") would otherwise
# survive this step and still end up, lower-cased, in the term matrix
# (TermDocumentMatrix() lower-cases terms by default).
# Punctuation and digits are stripped afterwards.
docs <- docs %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers)

# Term-document matrix built from the cleaned corpus (terms in rows,
# documents in columns), despite the `dtm` name.
dtm <- TermDocumentMatrix(docs)

# Total occurrences of each term over all documents, largest first, and the
# same information as a data frame with columns `word` and `freq`.
matrix <- as.matrix(dtm)
words <- matrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)

# Show the terms that occur at least four times, then the ten most frequent
# terms together with their counts.
findFreqTerms(dtm, lowfreq = 4)
head(df, n = 10)

# Figure A3: bar chart of the ten most frequent words in the job titles.
# head() is used instead of df[1:10, ] so that a vocabulary with fewer than
# ten terms gives fewer bars rather than NA-filled rows.
top_words <- head(df, 10)
barplot(top_words$freq, las = 2, names.arg = top_words$word,
        col = "lightblue", main = "Most frequent words in job titles",
        ylim = c(0, 140), ylab = "Word frequencies", cex.names = 0.7)

# To write the figure to a file, the png() device has to be opened before
# barplot() is called and closed with dev.off() afterwards, e.g.
#   png("figures/figA3.png", width = 800, height = 600, units = "px")
#   barplot(...)
#   dev.off()
# (res = 600 from the earlier draft is left out: at 800 x 600 px it implies a
# canvas of roughly 1.3 x 1 inches.)


#Table A1
# For each term below, list the other terms in `dtm` whose correlation with it
# is at least 0.25 (corlimit is the lower bound passed to findAssocs()).
# The ten terms are presumably the most frequent words shown in Figure A3;
# confirm against the output of head(df, 10).
# NOTE(review): kept as one call per term on purpose. Passing all terms to a
# single findAssocs() call may not return the same associations (the queried
# terms appear to be left out of each other's results); confirm before
# collapsing these calls.
findAssocs(dtm, terms = "director", corlimit = 0.25)
findAssocs(dtm, terms = "secretary", corlimit = 0.25)
findAssocs(dtm, terms = "general", corlimit = 0.25)
findAssocs(dtm, terms = "state", corlimit = 0.25)
findAssocs(dtm, terms = "attache", corlimit = 0.25)
findAssocs(dtm, terms = "plenipotentiary", corlimit = 0.25)
findAssocs(dtm, terms = "minister", corlimit = 0.25)
findAssocs(dtm, terms = "extraordinary", corlimit = 0.25)
findAssocs(dtm, terms = "undersecretary", corlimit = 0.25)
findAssocs(dtm, terms = "section", corlimit = 0.25)





